********************************************************************************
** 	TITLE: b13_kites_cleaning.do
**
**	PROJECT: IGNITE
** 
**  PURPOSE: Cleans the Kites data for the court-closure event study and sentiment analysis

********************************************************************************
set sortseed 13

/**************************************
A. Kites Crosswalk Creation
**************************************/	
	* Load data
		* 2019 is delivered as a single CSV; 2020-2022 each come as two half-year CSVs.
		* Every file is tagged with its year and parked in its own tempfile before stacking.
		import delimited "$input_data/kites_11jan2023/Jail population kites 2019.csv", clear
		gen year=2019
		tempfile read_csv2019
		save `read_csv2019', replace

		forvalues yr=2020/2022{
			local half 0
			foreach span in jan-june july-dec{
				* map the file-name span onto the tempfile tag: jan-june -> 1st, july-dec -> 2nd
				local ++half
				local tag : word `half' of 1st 2nd

				import delimited "$input_data/kites_11jan2023/Jail population kites `span' `yr'.csv", clear
				gen year = `yr'
				tempfile read_csv`tag'`yr'
				save `read_csv`tag'`yr'', replace
			}
		}

		* Stack everything in chronological order, starting from 2019.
		* NOTE(review): -force- lets append proceed when a column is string in one
		* file and numeric in another, setting the mismatched values to missing.
		use `read_csv2019', clear

		forvalues yr= 2020/2022{
			foreach tag in 1st 2nd{
				append using `read_csv`tag'`yr'', force
			}
		}
		
	* Rename variables
		* Columns are referenced by their positional import names (v#); v27 holds
		* the message-type marker and is renamed together with the others.
		ren (v4 v6 v8 v10 v12 v13 v15 v16 v17 v19 v26 v27 v28 v29) ///
			(request_type request_subtype assigned_group assigned_user ///
			request_number status inmate_name initial_location current_location ///
			inmate_number timestr message_type officer_name message)
	
	* Clean variables
		* One indicator per raw message-type marker. The comparison uses the raw
		* text (colons included), so the flags are built before the colons are stripped.
		local flag_stubs original_request redirected final_officer_response ///
			inmate_response officer_response user_assigned_changed draft
		local pos 0
		foreach marker in "ORIGINAL REQUEST:" "GROUP ASSIGNED CHANGED" "CLOSED:" ///
			"INMATE RESPONSE:" "RESPONSE:" "USER ASSIGNED CHANGED" "[DRAFT]" {
			local ++pos
			local stub : word `pos' of `flag_stubs'
			gen D_`stub' = message_type == "`marker'"
		}
		
		* strip the trailing colon from the marker text
		replace message_type = subinstr(message_type, ":", "", .)
		
		* rows without a request type are discarded; all remaining unnamed v# columns are dropped
		drop if mi(request_type)
		drop v*
		
		* remove the literal prefixes that precede the location values
		replace current_location = subinstr(current_location, "Current Location: ", "", .)
		replace initial_location = subinstr(initial_location, "Initial Location: ", "", .)
	
		* Split the timestamp string into a daily date, a time of day, and a full
		* datetime. The first 10 characters are parsed as the date (MDY) and the
		* text from position 12 onward as hours:minutes.
		gen date = date(substr(timestr,1,10), "MDY")
		format date %td
		
		* %tc values are milliseconds since 01jan1960 00:00 and must be stored as
		* double: the default float type cannot represent them exactly and would
		* silently round the timestamps.
		gen double time = clock(substr(timestr,12,.), "hm")
		format time %tc
		
		gen double datetime = clock(timestr,"MDYhm")
		format datetime %tc
		
		* condition macro selecting the original requests
		global from_inmate (D_original_request==1)

		* Hour of day (0-23) taken directly from the clock value. The previous
		* interval loop used (j, j+1] buckets, which left 00:00 missing and put
		* every on-the-hour time (e.g. 13:00) into the preceding hour.
		gen hrs = hh(time)
		
		* Lower-cased copy of the message so the keyword search ignores case.
		gen message_lower = lower(message)
		
		* Each topic flag is 1 when the message contains at least one of the
		* listed substrings (plain substring match, not whole-word match).

		* release / out date
		gen D_early_rel = 0
		foreach kw in "release" "get out" "out date" "outdate" {
			replace D_early_rel = 1 if strpos(message_lower,"`kw'")>0
		}
		
		* court
		gen D_court_date = 0
		replace D_court_date = 1 if strpos(message_lower,"court")>0
		
		* facility / program names
		gen D_facility = 0
		foreach kw in "new path" "newpath" "odysse" {
			replace D_facility = 1 if strpos(message_lower,"`kw'")>0
		}
		
		* family and visits
		gen D_personal = 0
		foreach kw in "son" "daughter" "brother" "sister" "family" "visit" "husband" "wife" {
			replace D_personal = 1 if strpos(message_lower,"`kw'")>0
		}
		
		* single-keyword flags
		gen D_when = 0
		replace D_when = 1 if strpos(message_lower,"when")>0
		gen D_bond = 0
		replace D_bond = 1 if strpos(message_lower,"bond")>0

		* overcrowding
		gen D_overcrowd = 0
		foreach kw in "overcrowd" "over crowd" {
			replace D_overcrowd = 1 if strpos(message_lower,"`kw'")>0
		}
		
		* row carries an officer name
		gen D_officer = !missing(officer_name)
		* row is either an original request or an inmate response
		gen D_imate = (D_inmate_response==1 | D_original_request==1)
		

		* Build a de-identified inmate ID: one row per distinct inmate_number,
		* shuffled into a random order, with the row position used as the new ID.
		preserve
			keep inmate_number 
			duplicates drop
			* start from a deterministic order so the seeded draw is reproducible
			sort  inmate_number 
			set seed 12345 
			* NOTE(review): sortorder is stored in the default float type, so tied
			* draws are possible; tie order then depends on the sortseed set at the
			* top of the file. Do not change the statement order here without
			* checking that the assigned IDs stay the same.
			gen sortorder = runiform()
			sort sortorder
			gen desensitized_id = _n
			tempfile desensitized_id
			save `desensitized_id'
		restore	
		
		* Crosswalk from request_number to desensitized_id: one row per
		* (inmate_number, request_number) pair, with the raw inmate_number
		* removed before saving.
		preserve
			keep inmate_number request_number
			duplicates drop inmate_number request_number, force
			merge m:1 inmate_number using `desensitized_id', nogen
			keep desensitized_id request_number
			save "$output_data/kites_crosswalk.dta", replace
		restore
	
/**************************************
B. Structure dataset for court event studies
**************************************/		

	* Build the episode-level skeleton for the balanced panel: two rows per
	* episode (booking date and release date).
	use "$output_data/estimate.dta", clear
		format inmate %14s
	
	* sample restriction - same booking window as the main specification;
	* episodes without a release date are excluded
	drop if booking_date < td(01jan2019) | booking_date >= td(01jun2022)
	keep if release_date != .
	keep inmate booking_date release_date
	duplicates drop
	
	* episode identifier: one value per (inmate, booking, release) combination
	sort inmate booking_date release_date
	egen episode_id = group(inmate booking_date release_date)
	
	* stack the two endpoints into a single date column
	rename (booking_date release_date) (date0 date1)
	reshape long date, i(episode_id) j(endpoint)
	drop endpoint 
	duplicates drop
	
	* An episode booked and released on the same day collapses to one row after
	* the duplicates drop; only episodes with two distinct endpoints are kept.
	bysort episode_id: drop if _N != 2
	
	* keep only inmates that appear in the kites data
	preserve
		use "$input_data/kites_11jan2023/kites_cleaned_messages_identified.dta", clear
		keep inmate
		duplicates drop
		
		tempfile formerge 
		save `formerge'
	restore
	
	merge m:1 inmate using `formerge', keep(3) nogen

	* Expand every episode to one row per period between its booking and
	* release date.
	*
	* Without the -full- option, tsfill fills gaps inside each panel's own
	* first-to-last range, so one call replaces the former loop that preserved,
	* filtered, filled and saved a tempfile for every single episode. That loop
	* also started the stack from a hard-coded `forappend_5', which fails
	* whenever episode 5 has been filtered out and otherwise appended episode 5
	* twice.
	xtset episode_id date
	tsfill
	
	* Rows created by tsfill have an empty inmate id; copy it from the
	* episode's first (booking-date) row.
	bysort episode_id (date): replace inmate = inmate[1] if inmate == ""
	
	* (episode_id, date) is unique by construction (xtset enforces it), so no
	* duplicate clean-up is needed before saving.
	save "$output_data/balanced_panel_temp.dta", replace


/**************************************
C. Preparing Kites data for Sentiments Analysis in R
**************************************/			
	
		* Kites - inmate level
		use "$input_data/kites_11jan2023/kites_cleaned_messages_identified.dta", clear
		keep request_number inmate datetime* response* sender* 
	
		* reshape long: one row per request and message slot
		reshape long datetime response sender, i(request_number) j(n_mess)
		rename datetime date
		* discard empty slots and rows with sender == 1
		keep if date != . & sender != 1
				
		* Concatenate all messages sharing the same inmate and date into one
		* string. The stable sort keeps the post-reshape order within each group,
		* the running concatenation accumulates left to right, and the last row
		* of each group (which holds the full text) is the one retained.
		sort inmate date, stable
		by inmate date: gen alltext = response if _n == 1
		by inmate date: replace alltext = alltext[_n-1] + " " + response if _n > 1
		by inmate date: keep if _n == _N
		drop response
		rename alltext response		
		
		* Export the messages in two files split at 01sep2020 (the cutoff day
		* itself goes to the "before" file).
		* NOTE(review): the comparison against td() assumes date holds daily
		* values rather than %tc milliseconds - confirm against the input file.
		foreach period in before after {
			preserve
			if "`period'" == "before" {
				keep if date <= td(01sep2020) 
			}
			else {
				keep if date > td(01sep2020) 
			}
			keep request_number response
			
			save "$output_data/messages_`period'Sept2020.dta", replace
			restore
		}
		
		
/**************************************
D. Post-Sentiments Analysis in R: Combine Pre-and-Post IGNITE
**************************************/	

	* Stack the two sentiment files produced in R. The "after" file is parked
	* in a tempfile first so the "before" file ends up as the master and its
	* rows come first.
	import delimited "$input_data/kites_11jan2023/sentiment_afterSept2020.csv", varnames(1) case(preserve) clear
	g post = 1
	
	tempfile forappend
	save `forappend'
	
	import delimited "$input_data/kites_11jan2023/sentiment_beforeSept2020.csv", varnames(1) case(preserve) clear
	g post = 0
	
	append using `forappend'
	
	
	local cluster cluster(desensitized_id)
	
	* one indicator per distinct sentiment label found in the data
	levelsof sentiment, local(sent_levels)
	
	foreach lvl of local sent_levels {
	    g D_`lvl' = sentiment == "`lvl'"
	}
	drop sentiment
	
	* one row per word x request x period; a flag is 1 if any source row carried that label
	collapse (max) D_*, by(word request_number post)

	* purely positive, purely negative, or neither
	gen true_p = (D_positive ==1 & D_negative == 0)

	gen true_n = (D_negative == 1 & D_positive ==0)
	
	gen true_nt = !(true_p | true_n)
	
	* words tagged both positive and negative are discarded
	keep if !(D_positive & D_negative)
	
	* attach word counts and the de-identified inmate id; only matched rows are kept
	merge 1:1 word request_number using "$input_data/kites_11jan2023/word_count.dta", keep(3) nogen
	merge m:1 request_number using "$output_data/kites_crosswalk.dta", keep(3) nogen

	
	save "$output_data/kites_sentiment.dta", replace